{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "import os\n",
    "import time\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "\n",
    "from sklearn.manifold import TSNE\n",
    "from umap import UMAP\n",
    "from trimap import TRIMAP\n",
    "from pacmap import PaCMAP\n",
    "\n",
    "from processing.load_datasets import load_datasets\n",
    "from processing.dr_eval import run_eval\n",
    "from processing.configs import SINGLE_COLOR"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Short names of the datasets to embed; reused further down as the row labels of the comparison figure.\n",
    "datasets_names = [\"rts\", \"pdl\", \"ioc\", \"mjf\"]\n",
    "# `datasets` is indexed by dataset name in the cells below (datasets[dataset_name]).\n",
    "datasets = load_datasets(datasets_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Run dimensionality reduction (DR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optimal values for the hyperparameters found heuristically\n",
    "# c scales the TriMap arguments below: n_inliers=2*c, n_outliers=c, n_random=c.\n",
    "c = 100\n",
    " \n",
    "# One estimator instance per algorithm, keyed by the short name that is used in\n",
    "# result keys, output file names and plot titles. All are configured for a 2-D embedding.\n",
    "dr_algos = {\n",
    "    \"tsne\": TSNE(n_components=2, perplexity=50),\n",
    "    \"umap\": UMAP(n_components=2, min_dist=0.5, n_neighbors=30),\n",
    "    \"trimap\": TRIMAP(n_dims=2, n_inliers=2*c, n_outliers=c, n_random=c),\n",
    "    \"pacmap\": PaCMAP(n_components=2, n_neighbors=30, MN_ratio=5.0, FP_ratio=5.0)\n",
    "}\n",
    "\n",
    "# Algorithm names in dict order; this is the order they are run in and the column order of the figure.\n",
    "algo_names = list(dr_algos.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_dr(X, nn, dataset_name, algo_name,\n",
    "           N_SAMPLE=-1,\n",
    "           n_eval_runs=1,\n",
    "           n_sample=1000,\n",
    "           n_repetitions=10,\n",
    "           n_neighbors=100,\n",
    "           RUN_EVALUATION=True,\n",
    "           SAVE_EMBEDDINGS=False):\n",
    "    \"\"\"Embed X with one DR algorithm and optionally evaluate the embedding.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X : array-like\n",
    "        High-dimensional data; passed to the estimator's fit_transform and to\n",
    "        run_eval as X_high. X.shape[0] is recorded as the number of points.\n",
    "    nn : array-like or None\n",
    "        High-dimensional nearest neighbors, forwarded to run_eval as\n",
    "        neighbors_high. Only used when RUN_EVALUATION is True.\n",
    "    dataset_name : str\n",
    "        Used in result keys and output file names.\n",
    "    algo_name : str\n",
    "        Key of the estimator to use in dr_algos.\n",
    "    N_SAMPLE : int\n",
    "        Only used here as a tag in the output file names.\n",
    "    n_eval_runs : int\n",
    "        Number of embed-and-evaluate runs. Forced to 1 when evaluation is off.\n",
    "    n_sample, n_repetitions, n_neighbors : int\n",
    "        Forwarded to run_eval (n_neighbors as neighborhood_size) and stored\n",
    "        in the evaluation settings.\n",
    "    RUN_EVALUATION : bool\n",
    "        If False, no evaluation is run and an embedding already saved on disk\n",
    "        is loaded instead of being recomputed.\n",
    "    SAVE_EMBEDDINGS : bool\n",
    "        If True, the embedding is written to the embeddings folder.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Single entry mapping '<dataset_name>_<algo_name>' to the embedding of\n",
    "        the last run (or to the cached embedding).\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If evaluation is on and n_eval_runs is smaller than 1.\n",
    "    \"\"\"\n",
    "    os.makedirs(\"embeddings\", exist_ok=True)\n",
    "    embeddings_path = f\"embeddings/embeddings_{dataset_name}_{algo_name}_N{N_SAMPLE}.npy\"\n",
    "    result_key = f\"{dataset_name}_{algo_name}\"\n",
    "\n",
    "    if not RUN_EVALUATION:\n",
    "        n_eval_runs = 1\n",
    "        print(\"Evaluation is turned off. Only DR embeddings will be computed.\")\n",
    "        if os.path.exists(embeddings_path):\n",
    "            print(\"Embeddings already computed. Loading...\")\n",
    "            return {result_key: np.load(embeddings_path)}\n",
    "    elif n_eval_runs < 1:\n",
    "        raise ValueError(f\"n_eval_runs must be at least 1, got {n_eval_runs}\")\n",
    "\n",
    "    csv_path = f\"results/{dataset_name}/eval_{dataset_name}_{algo_name}_N{N_SAMPLE}.csv\"\n",
    "    if RUN_EVALUATION:\n",
    "        # Make sure the CSV can be written even when this function is called on its own.\n",
    "        os.makedirs(os.path.dirname(csv_path), exist_ok=True)\n",
    "\n",
    "    evaluation_results = {}\n",
    "    embedding = None\n",
    "    for i in range(n_eval_runs):\n",
    "        # Fit a fresh copy of the configured estimator on every run. The instances in\n",
    "        # dr_algos are shared by all runs and datasets, and an estimator that keeps\n",
    "        # state between fits (PaCMAP appears to store its sampled pairs by default --\n",
    "        # NOTE(review): confirm against the installed version) would otherwise carry\n",
    "        # that state over to the next run or dataset.\n",
    "        algo = copy.deepcopy(dr_algos[algo_name])\n",
    "\n",
    "        # Compute DR embedding\n",
    "        time_start = time.perf_counter()\n",
    "        embedding = algo.fit_transform(X)\n",
    "        run_time = time.perf_counter() - time_start\n",
    "\n",
    "        # Run evaluation\n",
    "        if RUN_EVALUATION:\n",
    "            eval_results = run_eval(X_high=X,\n",
    "                                    X_low=embedding,\n",
    "                                    neighbors_high=nn,\n",
    "                                    n_sample=n_sample,\n",
    "                                    n_repetitions=n_repetitions,\n",
    "                                    neighborhood_size=n_neighbors,\n",
    "                                    run_drmetrics=False)\n",
    "            eval_results[\"run_time\"] = run_time\n",
    "            # Save evaluation settings\n",
    "            eval_results[\"eval_settings\"] = {\n",
    "                \"n_points\": X.shape[0],\n",
    "                \"n_neighbors\": n_neighbors,\n",
    "                \"n_sample\": n_sample,\n",
    "                \"n_repetitions\": n_repetitions,\n",
    "                \"n_eval_runs\": n_eval_runs\n",
    "            }\n",
    "            evaluation_results[f\"{result_key}_run{i}\"] = eval_results\n",
    "\n",
    "            # Rewrite the CSV after every run so finished runs survive an interruption.\n",
    "            evaluation_results_df = pd.DataFrame(evaluation_results).T.reset_index()\n",
    "            evaluation_results_df.to_csv(csv_path, index=False)\n",
    "\n",
    "        if SAVE_EMBEDDINGS:\n",
    "            np.save(embeddings_path, embedding)\n",
    "\n",
    "    return {result_key: embedding}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_dr_on_dataset(dataset_name,\n",
    "                      N_SAMPLE = -1,\n",
    "                      n_eval_runs = 1,\n",
    "                      n_sample = 1000,\n",
    "                      n_repetitions = 10,\n",
    "                      n_neighbors = 100,\n",
    "                      RUN_EVALUATION = True,\n",
    "                      SAVE_EMBEDDINGS = False):\n",
    "    \"\"\"Run every algorithm in algo_names on one dataset.\n",
    "\n",
    "    The dataset is looked up in the global `datasets` by name. If N_SAMPLE is\n",
    "    positive and smaller than the dataset, N_SAMPLE rows are drawn without\n",
    "    replacement using a fixed seed. When RUN_EVALUATION is True the\n",
    "    high-dimensional nearest neighbors are computed once and shared by all\n",
    "    algorithms. All remaining arguments are forwarded unchanged to run_dr.\n",
    "\n",
    "    Returns a dict that merges the dicts returned by run_dr for each algorithm.\n",
    "    \"\"\"\n",
    "    # Folder that receives the evaluation results of this dataset\n",
    "    results_dir = f\"results/{dataset_name}\"\n",
    "    if not os.path.exists(results_dir):\n",
    "        os.makedirs(results_dir)\n",
    "\n",
    "    # Fetch the data, optionally subsampled to speed up computation\n",
    "    X = datasets[dataset_name]\n",
    "    n_points = X.shape[0]\n",
    "    if 0 < N_SAMPLE < n_points:\n",
    "        picked_rows = np.random.default_rng(42).choice(n_points, N_SAMPLE, replace=False)\n",
    "        X = X[picked_rows]\n",
    "\n",
    "    print(f\"---Starting evaluation for dataset {dataset_name} with {X.shape[0]} points---\")\n",
    "\n",
    "    # High-dimensional nearest neighbors are only needed for the evaluation.\n",
    "    # One extra neighbor is requested, presumably because each point is returned\n",
    "    # as its own nearest neighbor -- verify against run_eval.\n",
    "    if RUN_EVALUATION:\n",
    "        k = n_neighbors + 1\n",
    "        knn_high = NearestNeighbors(n_neighbors=k, n_jobs=-1).fit(X)\n",
    "        nn = knn_high.kneighbors(X, n_neighbors=k, return_distance=False)\n",
    "    else:\n",
    "        nn = None\n",
    "\n",
    "    results = {}\n",
    "    for algo_name in algo_names:\n",
    "        time_start = time.time()\n",
    "        algo_results = run_dr(X, nn, dataset_name, algo_name,\n",
    "                              N_SAMPLE=N_SAMPLE,\n",
    "                              n_eval_runs=n_eval_runs,\n",
    "                              n_sample=n_sample,\n",
    "                              n_repetitions=n_repetitions,\n",
    "                              n_neighbors=n_neighbors,\n",
    "                              RUN_EVALUATION=RUN_EVALUATION,\n",
    "                              SAVE_EMBEDDINGS=SAVE_EMBEDDINGS)\n",
    "        results.update(algo_results)\n",
    "        print(f\"---Completed {algo_name} for dataset {dataset_name} in {time.time() - time_start}---\")\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embeddings of every dataset/algorithm pair, keyed by \"<dataset>_<algo>\".\n",
    "embeddings = {}\n",
    "\n",
    "# Iterate over datasets_names rather than a second hard-coded list, so this loop covers\n",
    "# exactly the datasets that were loaded above and that the figure below looks up.\n",
    "for dataset in datasets_names:\n",
    "    results = run_dr_on_dataset(dataset,\n",
    "                                N_SAMPLE = 60000, # Sample N_SAMPLE points in the dataset\n",
    "                                n_eval_runs = 10, # Ignored (forced to 1) while RUN_EVALUATION is False\n",
    "                                n_sample = 1000, # Sample n_sample points to run some metrics on, to speed up computation\n",
    "                                n_repetitions = 10,\n",
    "                                n_neighbors = 100,\n",
    "                                RUN_EVALUATION = False,\n",
    "                                SAVE_EMBEDDINGS = False)\n",
    "    embeddings.update(results)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Visualisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "d = 3 # side length of each subplot, in inches\n",
    "b = 0.01 # border of zoomed in region plot\n",
    "n_rows = len(datasets_names)\n",
    "n_cols = len(algo_names)\n",
    "\n",
    "\n",
    "def add_zoom_inset(ax, points, x_lim, y_lim, bounds):\n",
    "    \"\"\"Add an inset to ax that zooms into one rectangular region of points.\n",
    "\n",
    "    points is the 2-D embedding drawn on ax (columns 0 and 1 are plotted),\n",
    "    x_lim and y_lim are the (min, max) data limits of the zoomed region, and\n",
    "    bounds is the [x0, y0, width, height] list handed to ax.inset_axes.\n",
    "    The zoomed region is marked on ax with a red indicator.\n",
    "    \"\"\"\n",
    "    axins = ax.inset_axes(bounds)\n",
    "    axins.scatter(points[:, 0], points[:, 1], s=0.05, alpha=0.5, c=SINGLE_COLOR)\n",
    "    axins.set_xlim(*x_lim)\n",
    "    axins.set_ylim(*y_lim)\n",
    "    axins.set_xticks([])\n",
    "    axins.set_yticks([])\n",
    "    ax.indicate_inset_zoom(axins, edgecolor=\"red\")\n",
    "\n",
    "\n",
    "# squeeze=False keeps axs two-dimensional even with a single dataset or a single algorithm,\n",
    "# so axs[i, j] below always works.\n",
    "fig, axs = plt.subplots(n_rows, n_cols, figsize=(d * n_cols, d * n_rows), squeeze=False)\n",
    "for i, dataset in enumerate(datasets_names):\n",
    "    for j, algo_name in enumerate(algo_names):\n",
    "        ax = axs[i, j]\n",
    "        points = embeddings[f\"{dataset}_{algo_name}\"]\n",
    "        ax.scatter(points[:, 0], points[:, 1], s=0.005, c=SINGLE_COLOR)\n",
    "        ax.set_xticks([])\n",
    "        ax.set_yticks([])\n",
    "        # Datasets label the rows, algorithms label the columns\n",
    "        if j == 0:\n",
    "            ax.set_ylabel(dataset.upper(), fontsize=14, fontweight='bold')\n",
    "        if i == 0:\n",
    "            ax.set_title(algo_name.upper(), fontsize=14, fontweight='bold')\n",
    "\n",
    "        # Add zoomed in region for rts_trimap and ioc_trimap\n",
    "        if dataset == \"rts\" and algo_name == \"trimap\":\n",
    "            x_min, x_max = -1500, 1500\n",
    "            y_min, y_max = -2000, 3500\n",
    "            w = 0.8 # width of zoomed in region\n",
    "            h = w * (x_max - x_min) / (y_max - y_min)\n",
    "            # Inset placed in the top-left corner of the subplot\n",
    "            add_zoom_inset(ax, points, (x_min, x_max), (y_min, y_max), [b, 1 - h - b, w, h])\n",
    "        elif dataset == \"ioc\" and algo_name == \"trimap\":\n",
    "            x_min, x_max = -75, 100\n",
    "            y_min, y_max = -150, 100\n",
    "            h = 0.4 # height of zoomed in region\n",
    "            w = h * (y_max - y_min) / (x_max - x_min)\n",
    "            # Inset placed in the bottom-right corner of the subplot\n",
    "            add_zoom_inset(ax, points, (x_min, x_max), (y_min, y_max), [1 - w - b, b, w, h])\n",
    "\n",
    "plt.tight_layout()\n",
    "# savefig does not create missing folders\n",
    "os.makedirs(\"images\", exist_ok=True)\n",
    "plt.savefig(\"images/embeddings_comparison.png\", dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dr_eval",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
